1. Data preparation

Read data

SE object created from exploratory analysis.

Gene selection according to Biotype already performed

SE_Bio <- readRDS(paste0(params$SE_Bio))

Convert ENSG annotation to Gene Symbol before Differential Expression Analysis and duplicate gene names removal

Duplicated gene names are dropped and gene IDs are set as rownames.

The number of duplicated GeneName is: 11899

The number of duplicated Ensembl Genes with version is: 0

SE_Bio <- SE_Bio[!duplicated(rowData(SE_Bio)$GeneName), ]
rownames(SE_Bio) <- rowData(SE_Bio)$GeneName
SE_Bio
## class: SummarizedExperiment 
## dim: 24708 36 
## metadata(0):
## assays(1): counts
## rownames(24708): TSPAN6 TNMD ... LNCDAT CDR1
## rowData names(9): Gene EnsGene ... Start End
## colnames(36): OLE_001 OLE_002 ... OLE_048 OLE_049
## colData names(11): SampleNumber SampleID_GU ... HRID lib.size
ScaledCols <- c('darkblue', "purple","white","lightgoldenrod1", 'goldenrod1')

padj_threshold = params$padj_threshold
log2fc_threshold = params$log2fc_threshold

Filtering Thresholds are set to:

  • Log2FC = 1.5
  • FDR = 0.05

Sample selection

Only iPSCs are kept

SE_Bio_d25CbO <- SE_Bio[, colData(SE_Bio)$Timepoint %in% 'd0']
colData(SE_Bio_d25CbO)
## DataFrame with 9 rows and 11 columns
##         SampleNumber SampleID_GU SampleID_OG             SampleName  SampleType    Genotype
##            <integer> <character> <character>            <character> <character> <character>
## OLE_001            1    IPS_X_WT    IPS_X_WT iPSCs_CHD2_WT_p63_10..       iPSCs          WT
## OLE_002            2    IPS_X_HT    IPS_X_HT iPSCs_CHD2_HT_40_111..       iPSCs          HT
## OLE_004            3    IPS_K_WT    IPS_K_WT iPSCs_CHD2_WT_p64_04..       iPSCs          WT
## OLE_005            4    IPS_K_HT    IPS_K_HT iPSCs_CHD2_HT_p38_04..       iPSCs          HT
## OLE_026           13     OLE_026        OL01             1_iPSC_WTC       iPSCs          WT
## OLE_027           14     OLE_027        OL02             2_iPSC_WT3       iPSCs          WT
## OLE_028           15     OLE_028        OL03           3_iPSC_A11-5       iPSCs          AR
## OLE_029           16     OLE_029        OL04              4_iPSC_A5       iPSCs          AR
## OLE_030           17     OLE_030        OL05          5_iPSC_HTpull       iPSCs          HT
##           Timepoint       Batch    SeqRun        HRID  lib.size
##         <character> <character> <integer> <character> <numeric>
## OLE_001          d0   iPSCRound         1     OLE_001  18760237
## OLE_002          d0   iPSCRound         1     OLE_002  19271736
## OLE_004          d0   iPSCRound         1     OLE_004  16535067
## OLE_005          d0   iPSCRound         1     OLE_005  21791289
## OLE_026          d0        Evo1         2     OLE_026  23062896
## OLE_027          d0        Evo1         2     OLE_027  21313277
## OLE_028          d0        Evo1         2     OLE_028  23796763
## OLE_029          d0        Evo1         2     OLE_029  22872619
## OLE_030          d0        Evo1         2     OLE_030  21414146

DESeq2

2. Generation of the dds object

  • DDS object is generated from the Count Matrix and Sample Metadata stored in the SE_Bio object
  • Genotype is specified for the design: Heterozygous, Wildtype.
CountMatrix <- assays(SE_Bio_d25CbO)$counts
SampleMeta <- DataFrame(colData(SE_Bio_d25CbO))

all(rownames(SampleMeta) == colnames(CountMatrix))
## [1] TRUE
dds <- DESeqDataSetFromMatrix(countData=assays(SE_Bio_d25CbO)$counts, DataFrame(colData(SE_Bio_d25CbO)), design = ~Batch+Genotype)
## converting counts to integer mode
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in design formula are
## characters, converting to factors

mcols(dds) <- DataFrame(mcols(dds), DataFrame(rowData(SE_Bio_d25CbO)))

dds$Genotype <- factor(dds$Genotype, levels = c("WT", "AR", 'HT')) #no need to specify as column is already ordered, but safer
dds$Genotype <- relevel(dds$Genotype, ref = "WT")


dds$Genotype
## [1] WT HT WT HT WT WT AR AR HT
## Levels: WT AR HT

dds
## class: DESeqDataSet 
## dim: 24708 9 
## metadata(1): version
## assays(1): counts
## rownames(24708): TSPAN6 TNMD ... LNCDAT CDR1
## rowData names(9): Gene EnsGene ... Start End
## colnames(9): OLE_001 OLE_002 ... OLE_029 OLE_030
## colData names(11): SampleNumber SampleID_GU ... HRID lib.size

Inspection of genes with zero counts

ZeroPlot <- CountMatrix %>% 
  mutate(row = row_number()) %>%
  pivot_longer(cols = -row, names_to = "col", values_to = "value") %>% 
  filter(value == 0) %>%
  group_by(col) %>%
  summarise(zerocount = n())  %>%
  ggplot(., aes(y=col, x=zerocount)) +
  geom_col(col='black', fill='#76c8c8') +
  coord_flip() + 
  geom_label(aes(label=zerocount)) + 
  labs(title=paste0('Number of genes with zero counts ', '(out of ', nrow(CountMatrix), ' genes)'),
       y='', x='') +
  theme_bw() +
  theme(axis.text = element_text(colour = 'black', size=7),
        axis.text.x = element_text(angle=45, hjust = 0.5, vjust=0.5),
        plot.title = element_text(hjust = 0.5, size = 7))

ZeroPlot

Filtering of the dds object

  • I focus on protein-coding and lncRNA genes only (other biotypes were already removed from the SE_Bio object)
  • I implement a filter on the expression of at least 5 reads in at least 2 samples
keep <- rowSums(counts(dds)>=5) >= 2 #changed from 5 ##changed from 10 and 2 samples from 3

table(keep)
## keep
## FALSE  TRUE 
##  7727 16981
dds <- dds[keep,]

dds
## class: DESeqDataSet 
## dim: 16981 9 
## metadata(1): version
## assays(1): counts
## rownames(16981): TSPAN6 TNMD ... C8orf44 PDCD6-AHRR
## rowData names(9): Gene EnsGene ... Start End
## colnames(9): OLE_001 OLE_002 ... OLE_029 OLE_030
## colData names(11): SampleNumber SampleID_GU ... HRID lib.size

A dds object containing information about 16981 genes in 9 samples is tested for differential expression.


3. Differential Expression

dds <- DESeq(dds)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
sizeFactors(dds)
##   OLE_001   OLE_002   OLE_004   OLE_005   OLE_026   OLE_027   OLE_028   OLE_029   OLE_030 
## 0.8770382 0.8787711 0.7744089 0.9846501 1.1871144 1.0970942 1.1832126 1.1512840 1.0245872

Inspection of the dds object

Top50 genes heatmap

select <- order(rowMeans(counts(dds,normalized=TRUE)),
                decreasing=TRUE)[1:50]
ntd <- normTransform(dds)
df <- as.data.frame(colData(dds)[,c("Genotype")])
colnames(df) <- 'Genotype'
#df$Run <- c(rep('rep1', 3), rep('rep2', 3))

rownames(df) <- colnames(ntd)

pheatmap(assay(ntd)[select,], cluster_rows=FALSE, show_rownames=FALSE,
         cluster_cols=TRUE, annotation_col=df, border_color = 'black')

Samples distances

vsd <- vst(dds, blind=FALSE)
sampleDists <- dist(t(assay(vsd)))

sampleDistMatrix <- as.matrix(sampleDists)
#rownames(sampleDistMatrix) <- paste(vsd$condition, vsd$type, sep="-")
#colnames(sampleDistMatrix) <- NULL
colors <- colorRampPalette(rev(RColorBrewer::brewer.pal(9, "Blues")) )(255)
pheatmap(sampleDistMatrix,
         clustering_distance_rows=sampleDists,
         clustering_distance_cols=sampleDists,
         col=colors, main = 'Samples Distances', name = 'vst')

Counts Distribution

SF <- data.frame(Sample=names(sizeFactors(dds)), SizeF=sizeFactors(dds))

par(mfrow=c(2,2),cex.lab=0.7)
boxplot(log2(counts(dds)),  col=ScaledCols, cex.axis=0.7, 
        las=1, xlab="log2(counts)", horizontal=TRUE, main="Raw counts")
boxplot(log2(counts(dds, normalized=TRUE)),  col=ScaledCols, cex.axis=0.7, 
        las=1, xlab="log2(normalized counts)", horizontal=TRUE, main="Normalized counts") 
plotDensity(log2(counts(dds)),  col=ScaledCols, 
            xlab="log2(counts)", cex.lab=0.7, panel.first=grid()) 
plotDensity(log2(counts(dds, normalized=TRUE)), col=ScaledCols, 
            xlab="log2(normalized counts)", cex.lab=0.7, panel.first=grid()) 

Dispersion estimate

plotDispEsts(dds)


Extract Result of contrast: HT vs WT

res_dds_ht <- results(dds, contrast=c("Genotype","HT","WT"), alpha=0.05, cooksCutoff=0.99)
summary(res_dds_ht)
## 
## out of 16981 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up)       : 1976, 12%
## LFC < 0 (down)     : 1673, 9.9%
## outliers [1]       : 0, 0%
## low counts [2]     : 0, 0%
## (mean count < 1)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results

metadata(res_dds_ht)$filterThreshold
##       0% 
## 1.060529
metadata(res_dds_ht)$alpha
## [1] 0.05

plot(metadata(res_dds_ht)$filterNumRej, 
     type="b", ylab="number of rejections",
     xlab="quantiles of filter", main='Heterozygous',
     cex.lab=0.5, cex.axis=0.5, cex.main=0.5)
lines(metadata(res_dds_ht)$lo.fit, col="red")
abline(v=metadata(res_dds_ht)$filterTheta)

Top results from res table sorted by adjusted Pvalue for HT vs WT

head(res_dds_ht[order(res_dds_ht$padj),])
## log2 fold change (MLE): Genotype HT vs WT 
## Wald test p-value: Genotype HT vs WT 
## DataFrame with 6 rows and 6 columns
##         baseMean log2FoldChange     lfcSE      stat       pvalue         padj
##        <numeric>      <numeric> <numeric> <numeric>    <numeric>    <numeric>
## PDGFA   1371.073        3.08521  0.132934   23.2086 3.72933e-119 6.33277e-115
## UTF1     620.684        5.44024  0.247564   21.9751 4.98608e-107 4.23343e-103
## ECEL1    369.708        3.78976  0.182203   20.7997  4.35644e-96  2.46589e-92
## ZNF562   396.203       -3.43119  0.179106  -19.1574  8.40246e-82  3.56705e-78
## VENTX    411.739        3.83143  0.203351   18.8414  3.45413e-79  1.17309e-75
## MLLT6   1254.439        2.08808  0.119621   17.4557  3.11301e-68  8.81035e-65

  • Genes modulated considering a FDR threshold of 0.1: 4550

  • Genes modulated considering a FDR threshold of 0.05: 3649

  • Genes modulated considering a FDR threshold of 0.05 and FC threshold of 1.5: 2574

  • Genes modulated considering a FDR threshold of 0.05 and FC threshold of 2: 1639


Fold-change shrinkage

Since I am using the constrast option to retrieve the results, I cannot rely on apleglm default algorithm for logFC shrinkage. Since at the moment I am not using the lfcThreshold option, I decide for the ashr algorithm.

resAshr_ht <- lfcShrink(dds, contrast=c("Genotype","HT","WT"), res=res_dds_ht, type="ashr")
## using 'ashr' for LFC shrinkage. If used in published research, please cite:
##     Stephens, M. (2016) False discovery rates: a new deal. Biostatistics, 18:2.
##     https://doi.org/10.1093/biostatistics/kxw041
summary(resAshr_ht)
## 
## out of 16981 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up)       : 1976, 12%
## LFC < 0 (down)     : 1673, 9.9%
## outliers [1]       : 0, 0%
## low counts [2]     : 0, 0%
## (mean count < 1)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results

Heterozygous

#par(mfrow=c(1,2), mar=c(4,4,2,1))
xlim <- c(1,1e5); ylim <- c(-6,6)
DESeq2::plotMA(res_dds_ht, xlim=xlim, ylim=ylim, main="no LFC shrink")

DESeq2::plotMA(resAshr_ht, xlim=xlim, ylim=ylim, main="LFC shrink with ashr algorithm")


Top results from res table sorted by adjpvalue after LFC shrinkage

head(resAshr_ht[order(resAshr_ht$padj),])
## log2 fold change (MMSE): Genotype HT vs WT 
## Wald test p-value: Genotype HT vs WT 
## DataFrame with 6 rows and 5 columns
##         baseMean log2FoldChange     lfcSE       pvalue         padj
##        <numeric>      <numeric> <numeric>    <numeric>    <numeric>
## PDGFA   1371.073        3.06714  0.133478 3.72933e-119 6.33277e-115
## UTF1     620.684        5.41367  0.247868 4.98608e-107 4.23343e-103
## ECEL1    369.708        3.76642  0.183406  4.35644e-96  2.46589e-92
## ZNF562   396.203       -3.40353  0.180476  8.40246e-82  3.56705e-78
## VENTX    411.739        3.80289  0.204973  3.45413e-79  1.17309e-75
## MLLT6   1254.439        2.06906  0.119762  3.11301e-68  8.81035e-65

  • Genes modulated considering a FDR threshold of 0.1: 4550

  • Genes modulated considering a FDR threshold of 0.05: 3649

  • Genes modulated considering a FDR threshold of 0.05 and FC threshold of 1.5: 2164

  • Genes modulated considering a FDR threshold of 0.05 and FC threshold of 2: 1340


Log Fold change shrinkage doesn’t change the results significantly, thus I decided to keep the standard DESeq2 workflow for testing differential gene expression.


Extract DEGs

  • Here I extract the DEGs passing adjusted Pvalue and logFC thresholds, for HT vs WT contrast (deseqDEGsHT)
deseqDEGsHT <- dplyr::filter(data.frame(res_dds_ht), padj < padj_threshold, abs(log2FoldChange) > log2(log2fc_threshold))

deseqDEGsHTashr <- dplyr::filter(data.frame(resAshr_ht), padj < padj_threshold, abs(log2FoldChange) > log2(log2fc_threshold))

Results Visualization

SE_deseq2 <- as(dds, "RangedSummarizedExperiment")
assays(SE_deseq2)$vst <- assay(vst(dds, blind=TRUE))
FdrCeil = 1e-10
logFcCeil = 7.5

Plotting Ceilings are set to:

  • Log2FC 7.5
  • FDR 10^{-10}

Volcano

Heterozygous

#rename(as.data.frame(res_dds_ht), logFC = log2FoldChange, FDR = padj) %>% VolcanoTiltedNodash(. , AdjustedCutoff = 0.05, LabellingCutoff = 0.01, FCCutoff = 2, main = 'HT vs WT')

dplyr::rename(as.data.frame(res_dds_ht), logFC = log2FoldChange, FDR = padj) %>% VolcanoTiltedNodash(. , AdjustedCutoff = padj_threshold, LabellingCutoff = 0.01, FCCutoff = log2fc_threshold, main = 'HT vs WT (iPSCs CO)') + labs(y='Log FoldChange') + xlim(0, -log10(FdrCeil)) + ylim(-logFcCeil, logFcCeil)

Heterozygous LFC shrink

#rename(as.data.frame(res_dds_ht), logFC = log2FoldChange, FDR = padj) %>% VolcanoTiltedNodash(. , AdjustedCutoff = 0.05, LabellingCutoff = 0.01, FCCutoff = 2, main = 'HT vs WT')

dplyr::rename(as.data.frame(resAshr_ht), logFC = log2FoldChange, FDR = padj) %>% VolcanoTiltedNodash(. , AdjustedCutoff = padj_threshold, LabellingCutoff = 0.01, FCCutoff = log2fc_threshold, main = 'HT vs WT (iPSCs CO)') + labs(y='Log FoldChange') + xlim(0, -log10(FdrCeil)) + ylim(-logFcCeil, logFcCeil)


9. Savings

DEAList <- list()

DEAList <- list(dds = dds,  #same for both
                HT = list(res = res_dds_ht, 
                          DEGs = deseqDEGsHT,
                          resAshr =resAshr_ht,
                          DEGsAshr = deseqDEGsHTashr))
# RDS
saveRDS(DEAList, paste0(SavingFolder, '/ipsc.', 'DEAList_HT.rds')) 

saveRDS(SE_deseq2, paste0(SavingFolder, '/ipsc.', 'SE_deseq2_HT.rds')) 

saveRDS(res_dds_ht, paste0(SavingFolder, '/ipsc.', 'deseqHTvsWT.rds')) 

SessionInfo <- sessionInfo()
Date <- date()
save.image(paste0(SavingFolder, '/ipsc.', 'DEAnalysisWorkspace_HT.RData'))

last update on: Fri Jan 10 18:49:53 2025